{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# #coding=utf-8\n",
    "# #读取数据\n",
    "# import  pandas as pd\n",
    "# import numpy as np\n",
    "# import scipy as sp\n",
    "\n",
    "# train_data = pd.read_csv('./data/public.train.csv')\n",
    "# test_data = pd.read_csv('./data/public.test.csv')\n",
    "# submit = pd.read_csv('./data/submit_example.csv')\n",
    "# y = train_data['发电量']\n",
    "# X = train_data.drop(['ID','发电量'], axis=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### train数据 头信息，信息描述"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### test 数据 头信息，信息描述"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### submit 文件展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#submit.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_data.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 按照葡萄酒项目修改的版本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.998230790656465\n",
      "0.021387213541789463\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['./model_save/rf_regressor.pkl']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. Import libraries and modules\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    " \n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import preprocessing\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.externals import joblib \n",
    " \n",
    "# 3. Load red wine data.\n",
    "# dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'\n",
    "# data = pd.read_csv(dataset_url, sep=';')\n",
    "\n",
    "train_data = pd.read_csv('./data/public.train.csv')\n",
    "test_data = pd.read_csv('./data/public.test.csv')\n",
    "submit = pd.read_csv('./data/submit_example.csv') \n",
    "\n",
    "# 4. Split data into training and test sets\n",
    "y = train_data['发电量']\n",
    "X = train_data.drop(['发电量','ID'], axis=1)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, \n",
    "                                                    test_size=0.2, \n",
    "                                                    random_state=123, \n",
    "                                                   )\n",
    " \n",
    "# 5. Declare data preprocessing steps\n",
    "pipeline = make_pipeline(preprocessing.StandardScaler(), \n",
    "                         RandomForestRegressor(n_estimators=100))\n",
    " \n",
    "# 6. Declare hyperparameters to tune\n",
    "hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],\n",
    "                  'randomforestregressor__max_depth': [None, 5, 3, 1]}\n",
    " \n",
    "# 7. Tune model using cross-validation pipeline\n",
    "clf = GridSearchCV(pipeline, hyperparameters, cv=10)\n",
    " \n",
    "clf.fit(X_train, y_train)\n",
    " \n",
    "# 8. Refit on the entire training set\n",
    "# No additional code needed if clf.refit == True (default is True)\n",
    " \n",
    "# 9. Evaluate model pipeline on test data\n",
    "pred = clf.predict(X_test)\n",
    "print (r2_score(y_test, pred))\n",
    "print (mean_squared_error(y_test, pred))\n",
    " \n",
    "# 10. Save model for future use\n",
    "joblib.dump(clf, './model_ave/rf_regressor.pkl')\n",
    "# To load: clf2 = joblib.load('rf_regressor.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 提交的版本 最后提交的时间：2018.7.26  22:00"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Import libraries and modules\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    " \n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import preprocessing\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.externals import joblib \n",
    " \n",
    "# 3. Load red wine data.\n",
    "# dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'\n",
    "# data = pd.read_csv(dataset_url, sep=';')\n",
    "\n",
    "train_data = pd.read_csv('./data/public.train.csv')\n",
    "test_data = pd.read_csv('./data/public.test.csv')\n",
    "submit = pd.read_csv('./data/submit_example.csv') \n",
    "\n",
    "# 4. Split data into training and test sets\n",
    "y = train_data['发电量']\n",
    "X = train_data.drop(['ID','发电量'], axis=1)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, \n",
    "                                                    test_size=0.2, \n",
    "                                                    random_state=123, \n",
    "                                                   )\n",
    "\n",
    "scaler = preprocessing.StandardScaler().fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "pipeline = make_pipeline(preprocessing.StandardScaler(), \n",
    "                         RandomForestRegressor(n_estimators=100))\n",
    "\n",
    "hyperparameters = { 'randomforestregressor__max_features' : [ 'sqrt'],\n",
    "                  'randomforestregressor__max_depth': [None]}\n",
    "\n",
    "\n",
    "# 5. Declare data preprocessing steps\n",
    "pipeline = make_pipeline(preprocessing.StandardScaler(), \n",
    "                         RandomForestRegressor(n_estimators=100))\n",
    " \n",
    "# 6. Declare hyperparameters to tune\n",
    "hyperparameters = { 'randomforestregressor__max_features' : [ 'sqrt'],\n",
    "                  'randomforestregressor__max_depth': [None]}\n",
    " \n",
    "# 7. Tune model using cross-validation pipeline\n",
    "clf = GridSearchCV(pipeline, hyperparameters, cv=10)\n",
    " \n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "df_result = pd.DataFrame()\n",
    "df_result['ID'] = list(test_data['ID'])\n",
    "test_feature = test_data.drop('ID', axis=1)\n",
    "pre = clf.predict(test_feature)\n",
    "\n",
    "df_result['Score'] = pre\n",
    "df_result.to_csv('submit/submit.csv', index=False, header=False, float_format='%.8f')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "clf.best_params_\n",
    "\n",
    "{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_result = pd.DataFrame()\n",
    "# df_result['ID'] = list(test_all['ID'])\n",
    "# df_result['Score'] = pre\n",
    "# df_result.to_csv('result/lgb_result.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 取出数据和label"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 随机森林建模&&特征重要度排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# from sklearn.ensemble import RandomForestRegressor\n",
    "# from sklearn.metrics import roc_auc_score\n",
    "# import pandas as pd\n",
    "# x = pd.read_csv(\"train.csv\")\n",
    "# y = x.pop(\"Survived\")\n",
    "# model =  RandomForestRegressor(n_estimator = 100 , oob_score = TRUE, random_state = 42)\n",
    "# model.fit(x(numeric_variable,y)\n",
    "# print \"AUC - ROC : \", roc_auc_score(y,model.oob_prediction)\n",
    " \n",
    "# AUC - ROC：0.7386\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
